# Necessary imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import IFrame, HTML
from wordcloud import WordCloud, STOPWORDS
# Read all necessary data
# Combined corpus of all argument pairs (one row per org/response pair).
df = pd.read_csv('../data/complete_data.tsv', sep='\t')
# Pre-computed statistics saved via np.save as pickled dicts; allow_pickle=True
# is required to load them and .item() unwraps the 0-d object array back into
# a dict. Presumably keyed by dataset name (indexed below with e.g.
# data_stats_topic['political']).
data_stats_org = np.load('../data/stats/data_stats_org.npy', allow_pickle=True).item()
data_stats_resp = np.load('../data/stats/data_stats_resp.npy', allow_pickle=True).item()
data_stats_topic = np.load('../data/stats/data_stats_topic.npy', allow_pickle=True).item()
data_stats_sent = np.load('../data/stats/sent_stats.npy', allow_pickle=True).item()
# Tabular statistics (per author / per dataset) and the Nixon/Kennedy subset.
data_stats_author = pd.read_csv('../data/stats/data_stats_author.tsv', sep='\t')
data_stats_total = pd.read_csv('../data/stats/data_stats_total.tsv', sep='\t')
data_nix_ken = pd.read_csv('../data/stats/data_nix_ken.tsv', sep='\t')
# General stats of all datasets
# Number of unique arguments, number of total pairs, number of attacks/supports and unrelated pairs
# Statistics about the total length (org+response) of the pairs
# Important: debate_test/train is already repaired, but still not the same as in the paper
# Important: agreement had many rows which could not be parsed, e.g. because resp or org was
# empty; they were excluded, and the dataset is smaller than reported in the paper
# Important: there are two duplicates in the political dataset
# The length is important for the seq_len parameter of BERT
# Bare expression: rendered as a table when run as a notebook cell.
data_stats_total.loc[data_stats_total['dataset'].isin(['debate_test', 'debate_train', 'procon', 'political', 'agreement'])]
# Debate train/test by topic
# Topics that do not match the paper are Interentaccess and Militaryservice
# Most topics' attack/support distributions are similar to the overall distribution
# Stack the train and test per-topic tables under a two-level index.
pd.concat((data_stats_topic['debate_train'], data_stats_topic['debate_test']), keys=['train', 'test'])
# Political by topic
# Most topics have a similar distribution; minimum wage is an exception
data_stats_topic['political']
# Political by author
# Pairs by the same author are mostly supports
# Different authors mostly attack each other
# Dataset is heavily imbalanced with respect to the author; Kennedy occurs way more often
print(data_nix_ken.groupby("author").nunique())
# Color-code the author table so the imbalance is visible at a glance.
data_stats_author.style.background_gradient(cmap='Blues')
# Political duplicates
# List every pair whose (org, response) text occurs more than once.
data_set = 'political'
print(data_set + " Duplicates:")
subset = df[df['org_dataset'] == data_set]
duplicate_rows = subset[subset.duplicated(subset=['org', 'response'], keep=False)]
print(duplicate_rows)
# Plot distribution of length of org, resp and combined over the different datasets
# Seq_len 128/200 ~75% of debate_dataset, 250 ~75% political_dataset
fig, axes = plt.subplots(1, 2, figsize=(10, 4))  # 1 row, 2 columns
for axis, data_set in zip(axes, ('debate_extended', 'political')):
    # One boxplot panel per dataset, over that dataset's length columns.
    df[df['org_dataset'] == data_set].boxplot(ax=axis)
    axis.set_title(data_set)
plt.tight_layout()
# Plot how many arguments attack an argument (attack-ratio)
# Most arguments are only attacked or only supported (interesting for detecting arguments likely to be attacked/supported)
# If we disregard every argument which is only answered once, most arguments have an attack-ratio of 0.5
# In the case of the political dataset many arguments are unrelated, and unrelated arguments are disregarded in this plot
fig, (ax1,ax2) = plt.subplots(2,2, figsize=(10,4)) # 2 rows, 2 columns; ax1/ax2 are the rows
for data_set, ax in [('debate_extended', ax1), ('political',ax2)]:
    # iloc[:-1] drops the last row of the stats frame (presumably a totals
    # row, matching the same pattern used for the link-count plots -- TODO confirm).
    df_plot = data_stats_org[data_set].iloc[:-1].apply(
        # Fixed typo in the plotted label: "exluding" -> "excluding".
        lambda r: pd.Series({"Attack-ratio": r.attacked / r.tot,
                             "Attack-ratio (excluding arguments only attacked/supported once)": np.nan if r.tot == 1 else r.attacked / r.tot}),
        axis=1)
    # Ratio broken?
    df_plot.hist(density=False, ax=ax)
    ax[0].set_ylabel(data_set, rotation=0)
plt.tight_layout()
# First plot column shows how many answers an argument has (ingoing links)
# Second plot column shows how many outgoing links an argument has
# Most arguments only have one ingoing link, but some have many: ~10 debate, ~30 political
# In debate (original) every argument only has one outgoing link; in political most have one, but some have many ~8
fig, (ax1,ax2,ax3) = plt.subplots(3,2, figsize=(10,4)) # 3 rows, 2 columns; ax1..ax3 are the rows
for data_set, ax in [('debate_test', ax1), ('debate_extended', ax2), ('political',ax3)]:
    # 'tot' = total responses per org argument; iloc[:-1] drops the last row
    # (presumably a totals row -- TODO confirm).
    df_plot = data_stats_org[data_set].iloc[:-1]
    df_plot = df_plot['tot']
    # Ratio broken?
    df_plot.hist(density=True, ax=ax[0])
    ax[0].set_title('{0}, org'.format(data_set))
    ax[1].set_title('{0}, resp'.format(data_set))
    # Same statistic per response argument (outgoing links).
    df_plot = data_stats_resp[data_set].iloc[:-1]
    df_plot = df_plot['tot']
    # Ratio broken?
    df_plot.hist(bins=np.arange(0, 10), ax=ax[1])
plt.tight_layout()
# Scattertext of the responses in debate_train
# No special "attacking" or "supporting" words are easily recognizable
# The words are either topic specific, e.g. China (in topic Chinaonechildpolicy there are more supports than attacks)
# or they seem to be there by chance (small dataset), e.g. "he", "does"
# Embed the pre-generated interactive scattertext HTML in the notebook.
IFrame(src='./scattertext_attack_supportdebate_train.html', width=950, height=500)
# Lime Visualization
# Some of the words have an influence as expected, e.g. "are" and "not" (attack), "play" and "alcohol" (support)
# Others do not have the expected influence, e.g. "china" (attack, and not support as expected)
# Overall, all weights are really small and the removal/replacement with UNK of a single word
# does not change the prediction
HTML(filename='./lime.html')
# Anchor Visualization
# Anchor did not find a way to change some words and then predict the other class
HTML(filename='./anchor.html')
# Wordclouds for Kennedy and for Nixon
# Both often say the name of the other candidate; Nixon talks about President Eisenhower
fig, axes = plt.subplots(1, 2, figsize=(12, 10))  # 1 row, 2 columns
stopwords = set(STOPWORDS)
# Same rendering for both candidates: Nixon on the left axis, Kennedy on the right.
for axis, author in zip(axes, ('Nixon', 'Kennedy')):
    # Concatenate all of this author's utterances into one document.
    document = " ".join(t for t in data_nix_ken.loc[data_nix_ken["author"] == author, 'text'])
    cloud = WordCloud(stopwords=stopwords).generate(document)
    axis.imshow(cloud, interpolation="bilinear")
    axis.set_title(author + " WordCloud")
    axis.set_axis_off()
plt.tight_layout()
# Scattertext
# Scattertext of the authors in political
# The word usage of Nixon and Kennedy is quite different
IFrame(src='./scattertext_nixon_kennedy.html', width=950, height=500)
# Lime
# All words have a very small impact
HTML(filename='./lime_pol.html')
# Anchors
# No rule found
HTML(filename='./anchor_pol.html')
# Major Class
def get_major_acc(x, classes=None):
    """Return the majority-class accuracy for one row of class counts.

    Args:
        x: Row (pd.Series) with one count column per class.
        classes: Column names to consider; defaults to the three-way
            unrelated/attack/support split. The default was changed from a
            mutable list literal to the None sentinel (B006 pitfall);
            behavior is unchanged.

    Returns:
        max(count) / sum(counts), i.e. the accuracy of a baseline that
        always predicts the most frequent class.
    """
    if classes is None:
        classes = ['unrelated', 'attack/disagreement', 'support/agreement']
    return np.divide(x[classes].max(), np.sum(x[classes]))
def get_major_class(x, classes=None):
    """Return the name of the most frequent class in one row of counts.

    Args:
        x: Row (pd.Series) with one count column per class.
        classes: Column names to consider; defaults to the three-way
            unrelated/attack/support split. None sentinel instead of a
            mutable default list (B006 pitfall); behavior is unchanged.

    Returns:
        The column label with the highest count. The float cast keeps
        idxmax working on object-dtype rows produced by DataFrame.apply.
    """
    if classes is None:
        classes = ['unrelated', 'attack/disagreement', 'support/agreement']
    return x[classes].astype('float64').idxmax()
# Majority-class baseline per dataset (three-way labels).
for column, baseline in (('major_acc', get_major_acc), ('major_class', get_major_class)):
    data_stats_total[column] = data_stats_total.apply(baseline, axis=1)
selected = data_stats_total['dataset'].isin(['debate_test', 'political'])
data_stats_total.loc[selected, ['dataset', 'major_class', 'major_acc']]
# Major Class per Topic node
# Binary attack-vs-support baseline per debate_test topic.
data = data_stats_topic['debate_test']
binary_classes = ['attack', 'support']
data['major_acc'] = data.apply(get_major_acc, args=(binary_classes,), axis=1)
data['major_class'] = data.apply(get_major_class, args=(binary_classes,), axis=1)
data[['topic', 'major_class', 'major_acc', 'tot']]
# Major Class per Topic political
# Three-way baseline per topic: attack/support/unrelated.
data = data_stats_topic['political']
data['major_acc'] = data.apply(get_major_acc, args=[['attack','support', 'unrelated']], axis=1)
data['major_class'] = data.apply(get_major_class, args=[['attack','support', 'unrelated']], axis=1)
data[['topic', 'major_class', 'major_acc', 'tot']]
# Major Class per Topic political attack/support only
# Same baseline as above but ignoring unrelated pairs.
# NOTE(review): this fetches the same frame object from data_stats_topic and
# overwrites the major_acc/major_class columns added by the previous cell.
data = data_stats_topic['political']
data['major_acc'] = data.apply(get_major_acc, args=[['attack','support']], axis=1)
data['major_class'] = data.apply(get_major_class, args=[['attack','support']], axis=1)
data[['topic', 'major_class', 'major_acc', 'tot']]
# Major Class Author identified
# Baseline when the (author_resp, author_org) pair is known, three-way labels.
# .copy() keeps the added columns out of data_stats_author itself.
data = data_stats_author.copy()
data['major_acc'] = data.apply(get_major_acc, args=[['attack','support', 'unrelated']], axis=1)
data['major_class'] = data.apply(get_major_class, args=[['attack','support', 'unrelated']], axis=1)
data[['author_resp', 'author_org', 'major_class', 'major_acc', 'tot']]
# Major Class Author identified attack/support only
# Same per-author-pair baseline, ignoring unrelated pairs.
data = data_stats_author.copy()
data['major_acc'] = data.apply(get_major_acc, args=[['attack','support']], axis=1)
data['major_class'] = data.apply(get_major_class, args=[['attack','support']], axis=1)
data[['author_resp', 'author_org', 'major_class', 'major_acc', 'tot']]
# Merged to same author / different author
# Very high accuracy is possible if we only detect whether the authors are the same or different
data = data_stats_author.iloc[:-1].copy()  # drop the last row (totals)
is_same = data['author_resp'] == data['author_org']
data['authors'] = is_same.map({True: 'Same', False: 'Different'})
data = data.groupby('authors').sum().reset_index()
as_columns = ['attack', 'support']
data['major_acc'] = data.apply(get_major_acc, args=(as_columns,), axis=1)
data['major_class'] = data.apply(get_major_class, args=(as_columns,), axis=1)
data[['authors', 'major_class', 'major_acc', 'tot']]
# Sentiment Analysis (nltk vader)
# Only responses: in debate_test supporting arguments often have a positive sentiment,
# attacking arguments show nothing special
pd.concat((data_stats_sent['respdebate_test'],data_stats_sent['resppolitical']), keys=['node', 'political'], sort=True)
# Both org and response combined:
# attacks often have a different sentiment, supports often the same sentiment (node)
# Nothing meaningful for political
pd.concat((data_stats_sent['bothdebate_test'],data_stats_sent['bothpolitical']), keys=['node', 'political'], sort=True)
# TODO: Major Class for every Org argument
# TODO: Major Class for every Resp argument (only political)

# Node Acc with different parameters
# Fixed: input=both, seq_len=128, warmup_prop=0.1, seed=42
# Tested: model=base-uncased,large-uncased, epochs=3,4,5, batch_size=8,12,16, lr=2e-5, 3e-5, 5e-5
# Gradient accumulation: batch_size/4 for bert_large
# (in principle equivalent, in practice different because of rounding errors etc.)
eval_results = pd.read_csv('../pytorch/node_both/eval_results.tsv', sep='\t')
# Some stats: mean, min, max, std
# Paper acc 0.67, best bert acc 0.74, mean (bert-base) 0.62, baselines ~0.6
# String aggregation names instead of np.mean etc.: passing raw numpy callables
# to .agg is deprecated in pandas (FutureWarning) and resolves to the same
# built-in aggregations.
print(eval_results['acc'].agg(['mean', 'min', 'max', 'std']))
# Somehow bert-large performs worse than bert-base
print(eval_results.groupby('_bert-model')['acc'].agg(['mean', 'min', 'max', 'std']))
print()
# Print settings of best result
# idxmax returns an index *label*, so look it up with .loc (identical for the
# default RangeIndex, but correct for any index).
print(eval_results.loc[eval_results['acc'].idxmax()])
# Show the table
eval_results.head()

# Political F1 CrossVal
# Fixed: input=both, seq_len=256, warmup_prop=0.1, seed=42
# model=base-uncased, epochs=5, batch_size=12, lr=2e-5
# Related/Unrelated
eval_results = pd.read_csv('../pytorch/pol_ru/eval_results.tsv', sep='\t')
# Some stats: mean, min, max, std
# Paper average F1 0.65, here average F1 0.68, baseline ?
# String names: passing np.mean etc. to .agg is deprecated in pandas.
print(eval_results['f1'].agg(['mean', 'min', 'max', 'std']))
eval_results.head()
# Political F1 CrossVal
# Fixed: input=both, seq_len=256, warmup_prop=0.1, seed=42
# model=base-uncased, epochs=5, batch_size=12, lr=2e-5
# Attack/Support
eval_results = pd.read_csv('../pytorch/pol_as/eval_results.tsv', sep='\t')
# Some stats: mean, min, max, std
# Paper average F1 0.82, here average F1 0.73, baselines (author) ~0.85
# String names: passing np.mean etc. to .agg is deprecated in pandas.
print(eval_results['f1'].agg(['mean', 'min', 'max', 'std']))
eval_results.head()
# Political F1 CrossVal
# Fixed: input=both, seq_len=256, warmup_prop=0.1, seed=42
# model=base-uncased, epochs=5, batch_size=12, lr=2e-5
# Attack/Support/Unrelated
eval_results = pd.read_csv('../pytorch/pol_asu/eval_results.tsv', sep='\t')
# Some stats: mean, min, max, std
# Paper only reported precision 0.57, here average f1 0.60
# TODO: use some tricks to cope with the class imbalance!
# String names: passing np.mean etc. to .agg is deprecated in pandas.
print(eval_results['f1'].agg(['mean', 'min', 'max', 'std']))
eval_results.head()
# Agreement F1 CrossVal
# Comparison with Paper + Baselines
# Fixed: input=both, seq_len=256, warmup_prop=0.1, seed=42
# model=base-uncased, epochs=2, batch_size=12, lr=2e-5
# Agreement/Disagreement
eval_results = pd.read_csv('../pytorch/agreement/eval_results.tsv', sep='\t')
# Some stats: mean, min, max, std
# Paper average acc 0.74, here average acc 0.61
# TODO: the non cross_val version had acc ~0.97! Probably the parameters are bad,
# 2 epochs might not be enough (try again with a higher epoch count)
# String names: passing np.mean etc. to .agg is deprecated in pandas.
print(eval_results['acc'].agg(['mean', 'min', 'max', 'std']))
eval_results.head()
# Import the train/test split functions
import sys
import os
# Repo-relative path instead of the machine-specific absolute
# '/media/jannis/...' path: every other read in this notebook already uses
# '../data' and '../pytorch', so the pytorch package lives one level up.
sys.path.append(os.path.abspath("../pytorch"))
from run_classifier_dataset_utils import processors
# 'both' presumably selects org+response as model input (matches the
# "input=both" setting noted in the eval cells above) -- TODO confirm.
node_pro = processors['node']('both')
political_as_pro = processors['political-as']('both')
# Node results with respect to topic
# get_dev_examples appears to return a pair; only the second element (a
# DataFrame, judging by reset_index below) is used here -- TODO confirm.
_, node_test_df = node_pro.get_dev_examples('../data')
eval_preds = pd.read_csv('../pytorch/node_both/eval_preds.csv')
# Only predictions from bert-base
# NOTE(review): rows 27+ are assumed to be the bert-base runs, and the last
# column is dropped (presumably not a prediction column) -- verify against
# how eval_preds.csv is written.
res = pd.concat([node_test_df.reset_index(drop=True), eval_preds.iloc[27:,:-1].transpose().reset_index(drop=True)], axis=1)
res = res.replace({0: 'attack', 1: 'support'})
# For now, only one run (run 51) is used
# There are errors in every topic; no clear trend visible that some topics are better or worse
# More false classifications of attack than of support (support is the major class)
# Could also look at several runs, or the average, etc.
pd.crosstab(res['topic'], [res['label'],res[51]])
# Take the rounded mean prediction over all bert-base runs (a majority vote
# across runs, since predictions are 0/1 before replacement).
res['mean_round'] = eval_preds.iloc[27:,:-1].mean().round().values
res = res.replace({0: 'attack', 1: 'support'})
pd.crosstab(res['topic'], [res['label'],res['mean_round']])
# We can recreate all metrics from the available data,
# e.g. classification reports or confusion matrices
from sklearn.metrics import classification_report, confusion_matrix
# NOTE(review): res[51] above is the *column labeled* 51 (an original row
# index), while iloc[51,:-1] here is the *positional* row 51 -- these match
# only for a default RangeIndex; verify they refer to the same run.
print(classification_report(y_pred=eval_preds.iloc[51,:-1].replace({0: 'attack', 1: 'support'}), y_true=res['label']))
print(confusion_matrix(res['label'], eval_preds.iloc[51,:-1].replace({0: 'attack', 1: 'support'})))
# Political results with respect to topic
# One tuple of split frames per cross-validation fold.
splits_data = political_as_pro.get_splits('../data')
# Get the test data and the test predictions
# Element 3 of each fold tuple is taken as the test frame -- TODO confirm
# against get_splits.
pol_test_df = pd.concat(np.array(splits_data)[:,3])
eval_preds = pd.read_csv('../pytorch/pol_as/eval_preds.csv')
# stack() flattens the per-fold prediction rows (minus the last column) into
# one long column that lines up with the concatenated test frames.
pol_test_df['preds'] = eval_preds.iloc[:,:-1].stack().values
pol_test_df = pol_test_df.replace({0: 'attack', 1: 'support'})
pd.crosstab(pol_test_df['topic'], [pol_test_df['label'],pol_test_df['preds']])
# Political results with respect to author
pd.crosstab(pol_test_df['preds'], [pol_test_df['org_stance'],pol_test_df['response_stance']])
# Complete results political (all folds "summed")
print(classification_report(y_pred=pol_test_df['preds'], y_true=pol_test_df['label']))
print(confusion_matrix(y_pred=pol_test_df['preds'], y_true=pol_test_df['label']))
# Results with respect to same org, same resp (does an argument always get the same label or not?)
# Same org
# One org does not always get the same prediction (but often)
pd.crosstab(res['org'], res[51])
# Same org pol
# TODO: aggregate to get some useful insights
# (and maybe do it for every fold individually,
# because otherwise it could be that we always predict one label for one org in one fold
# and another label in another fold)
pd.crosstab(pol_test_df['org'], pol_test_df['preds']).head()
# Same resp pol
pd.crosstab(pol_test_df['response'], pol_test_df['preds']).head()
# TODO: Only org
# TODO: Only resp
# TODO: Train on one dataset, evaluate on another (without finetuning)
# TODO: With finetuning (reusing the classification layer)
# TODO: With finetuning + use a new classification layer